Virtual idols originate from Japan, with roots in anime and Japanese idol culture dating back to the 1980s. In the last decade, vtubers, or virtual YouTubers, have emerged thanks to the popularity of the Internet and devices such as smartphones, PCs and 3D motion sensors.
A vtuber on the Bilibili platform (also called a vup) is a live-streaming entertainer who uses a virtual avatar generated with computer graphics and animated by real-time motion-capture software. Danmakus data show that hundreds of vups earn more than 0.1 million CNY every month, and some even earn 1 million CNY. Bilibili's virtual-idol market is worth on the order of a billion CNY per year.
One question of interest is what factors influence income in vtuber live streams and how. We analyze the live data from different livers and aim to
illustrate the time-varying structure of the vup market and
find the underlying relationship among live parameters.
The live data come from wandleshen@github, collected with the Danmakus API from the last 50 live streams of each of 50 livers as of 2023-02-10 16:42:54 — 2477 live streams in all. (Not every liver has 50 live streams.) Vtuber information was collected manually from homepages: liver, id and affiliation are from wandleshen@github, while sex, country and ip were collected by myself.
# Load the per-stream records and the hand-collected vtuber metadata,
# then inspect the metadata's structure.
df <- read.csv("data.csv")
vtb <- read.csv("vtb2.csv")
str(vtb)
## 'data.frame': 50 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ liver : chr "七海Nana7mi" "中单光一" "内德维德" "弥希Miki" ...
## $ affiliation: chr "vr" "vr" "independent" "vr" ...
## $ id : int 434334701 434401868 90873 477317922 477342747 477306079 480680646 56748733 558070433 666726799 ...
## $ sex : chr "女" "男" "男" "女" ...
## $ country : chr "中国" "中国" "中国" "中国" ...
## $ ip : chr "上海" "上海" "新疆" "江苏" ...
# Drop the redundant row-index column and encode categorical fields as factors.
vtb$X <- NULL
for (col in c("sex", "ip", "country")) {
  vtb[[col]] <- as.factor(vtb[[col]])
}
# Packages for plotting, reshaping, dates, fonts, and modelling.
library(GGally)
library(pheatmap)
library(reshape2)
library(dplyr)
library(ggplot2)
library(lubridate)
library(extrafont)
library(marginaleffects)
library(MatchIt)
library(grf)
library(randomForest)
# font_import()  # run once to register system fonts with extrafont
# fonts()
# Structure of the live-stream records.
str(df)
## 'data.frame': 2477 obs. of 17 variables:
## $ liver : chr "-阿蕊娅Aria-" "-阿蕊娅Aria-" "-阿蕊娅Aria-" "-阿蕊娅Aria-" ...
## $ area : chr "虚拟Singer" "虚拟Singer" "虚拟主播" "虚拟主播" ...
## $ danmakusCount : int 2627 3762 2167 4039 1393 1418 3383 3101 3699 5789 ...
## $ startDate : num 1.68e+12 1.68e+12 1.68e+12 1.68e+12 1.68e+12 ...
## $ stopDate : num 1.68e+12 1.68e+12 1.68e+12 1.68e+12 1.68e+12 ...
## $ timeDuration : int 11054514 11844848 8461954 14922527 46881886 7544343 11505659 10361140 11247584 15638131 ...
## $ title : chr "下面我来简单唱两句" "赫赫,我来简单唱两句" "早上好!唱一会儿~" "歌杂~" ...
## $ totalIncome : num 1754 11057 3852 8225 666 ...
## $ watchCount : int 4078 4928 3382 4455 5452 2698 4348 4544 4997 6103 ...
## $ interactionCount : int 468 649 451 575 355 317 530 501 550 772 ...
## $ superchatCount : int 16 12 7 19 3 10 12 14 24 19 ...
## $ superchatIncome : int 680 597 230 3276 90 428 430 1480 930 890 ...
## $ superchatTimestamps : chr "[1675954450657, 1675955951334, 1675957817542, 1675957914753, 1675958621817, 1675958692099, 1675958916597, 16759"| __truncated__ "[1675864937946, 1675865846334, 1675866462953, 1675866781825, 1675867310789, 1675869256286, 1675871757716, 16758"| __truncated__ "[1675728302746, 1675729527578, 1675730123898, 1675734044137, 1675734117096, 1675734908071, 1675735637727]" "[1675605761897, 1675606540268, 1675606864640, 1675607128304, 1675607144758, 1675607910721, 1675608046866, 16756"| __truncated__ ...
## $ membershipCount : int 0 11 9 6 3 2 13 8 22 4 ...
## $ membershipIncome : int 0 7358 1242 2688 414 276 5114 1164 26134 16412 ...
## $ membershipTimestamps: chr "[]" "[1675868606293, 1675868631905, 1675869343361, 1675869438134, 1675869457256, 1675869889257, 1675870029686, 16758"| __truncated__ "[1675727902618, 1675728119437, 1675728413698, 1675728525204, 1675733327041, 1675733557159, 1675733721169, 16757"| __truncated__ "[1675608154614, 1675609365272, 1675609368458, 1675613019747, 1675615266252, 1675615911612]" ...
## $ followers : int 34583 34583 34583 34583 34583 34583 34583 34583 34583 34583 ...
# Column-wise summaries of the raw live-stream data; note the negative
# minimum of timeDuration, which is cleaned up below.
summary(df)
## liver area danmakusCount startDate
## Length:2477 Length:2477 Min. : 0 Min. :1.652e+12
## Class :character Class :character 1st Qu.: 2251 1st Qu.:1.672e+12
## Mode :character Mode :character Median : 5227 Median :1.674e+12
## Mean : 11817 Mean :1.673e+12
## 3rd Qu.: 11989 3rd Qu.:1.675e+12
## Max. :210878 Max. :1.676e+12
## stopDate timeDuration title totalIncome
## Min. :1.652e+12 Min. :-26494026 Length:2477 Min. : 0.0
## 1st Qu.:1.672e+12 1st Qu.: 7642526 Class :character 1st Qu.: 542.9
## Median :1.674e+12 Median : 10096143 Mode :character Median : 1550.4
## Mean :1.673e+12 Mean : 11152512 Mean : 5491.7
## 3rd Qu.:1.675e+12 3rd Qu.: 14005235 3rd Qu.: 4392.5
## Max. :1.676e+12 Max. : 70100660 Max. :450768.7
## watchCount interactionCount superchatCount superchatIncome
## Min. : 0 Min. : 0 Min. : 0.00 Min. : 0
## 1st Qu.: 5000 1st Qu.: 499 1st Qu.: 2.00 1st Qu.: 90
## Median : 10595 Median : 1108 Median : 9.00 Median : 352
## Mean : 21753 Mean : 2131 Mean : 25.05 Mean : 1338
## 3rd Qu.: 27410 3rd Qu.: 2644 3rd Qu.: 27.00 3rd Qu.: 1170
## Max. :265166 Max. :25175 Max. :1724.00 Max. :96989
## superchatTimestamps membershipCount membershipIncome membershipTimestamps
## Length:2477 Min. : 0.00 Min. : 0 Length:2477
## Class :character 1st Qu.: 1.00 1st Qu.: 138 Class :character
## Mode :character Median : 3.00 Median : 474 Mode :character
## Mean : 12.75 Mean : 3020
## 3rd Qu.: 8.00 3rd Qu.: 1716
## Max. :2173.00 Max. :386712
## followers
## Min. : 34583
## 1st Qu.: 115577
## Median : 263239
## Mean : 354179
## 3rd Qu.: 486642
## Max. :1339125
# Daily counts of stream starts, in days since the earliest stream
# (timestamps are epoch milliseconds; 8.64e07 ms = 1 day).
ggplot(data = df, aes(x = (startDate-min(startDate))/8.64e07)) + geom_histogram(binwidth=1)+
xlab("Day")
# Distribution of stream lengths in hours (3.6e06 ms = 1 hour).
ggplot(data = df, aes(x = timeDuration/3.6e06)) + geom_histogram(binwidth=1)+
xlab("Hour")
Transform the timestamps to date-times and remove streams with non-positive timeDuration.
# Convert millisecond epoch timestamps to POSIXct in Shanghai time, derive
# each stream's weekday, rescale duration to hours, drop non-positive
# durations, and attach the vtuber metadata.
ms_to_shanghai <- function(ms) {
  with_tz(as.POSIXct(ms / 1000, origin = "1970-01-01", tz = "UTC"), "Asia/Shanghai")
}
weekday_levels <- c("Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday")
df_tf <- df %>%
  mutate(
    posix_startDate = ms_to_shanghai(startDate),
    posix_stopDate = ms_to_shanghai(stopDate),
    weekday = factor(weekdays(posix_startDate), levels = weekday_levels),
    timeDuration = timeDuration / 3.6e06
  ) %>%
  filter(timeDuration > 0) %>%
  merge(vtb)
# Time-series panels of the numeric stream metrics against stream start time.
# Select the metric columns by name rather than by position (was
# c(3,6,8,9,10,11,12,14,15,18)) so the plot does not silently break if
# merge() reorders columns.
ts_cols <- c("danmakusCount", "timeDuration", "totalIncome", "watchCount",
             "interactionCount", "superchatCount", "superchatIncome",
             "membershipCount", "membershipIncome", "posix_startDate")
ggplot(melt(df_tf[, ts_cols], id = "posix_startDate"),
       aes(x = posix_startDate, y = value)) +
  geom_line() +
  facet_wrap(~variable, scales = "free_y", ncol = 3)
Plot the time series and find three peaks at about
2022-10-30, Christmas Eve and Spring Festival.
# Six streams with the highest membership income; the dates line up with the
# peaks seen in the time series (debut, new-costume and holiday streams).
df_tf%>%arrange(desc(membershipIncome))%>%
transmute(liver,title,membershipIncome,totalIncome,posix_startDate)%>%head()
## liver title membershipIncome
## 1 美月もも 【美月もも】出道首播 386712
## 2 阿梓从小就很可爱 晚好晚好晚好 362044
## 3 雫るる_Official LULU3D公布回!让你心动 288424
## 4 阿萨Aza 除夕快乐啊啊啊啊啊! 185988
## 5 雾深Girimi 【新衣回】超可爱的熊猫不是猫吗;; 154416
## 6 阿萨Aza 又是一年平安夜! 134574
## totalIncome posix_startDate
## 1 450768.7 2022-10-30 18:56:52
## 2 403178.9 2022-12-20 19:01:26
## 3 335778.0 2023-01-24 19:34:24
## 4 299294.0 2023-01-21 21:02:12
## 5 172216.0 2023-01-22 19:57:23
## 6 200934.5 2022-12-24 19:58:19
# Weekday profiles of the key metrics: per-stream medians (left) and weekday
# totals (right). Fixes the "weekadays" typo so both panels use the same
# grouping name; `weekday` is already a factor with Monday-first levels (set
# when df_tf was built), so no re-leveling is needed here.
par(mfrow = c(1, 2))
df_tf %>%
  group_by(weekdays = weekday) %>%
  summarise(watchCount = median(watchCount),
            interactionCount = median(interactionCount),
            totalIncome = median(totalIncome),
            danmakusCount = median(danmakusCount),
            superchatIncome = median(superchatIncome),
            membershipIncome = median(membershipIncome)) %>%
  plot(main = "median per weekday")
df_tf %>%
  group_by(weekdays = weekday) %>%
  summarise(watchCount = sum(watchCount),
            interactionCount = sum(interactionCount),
            totalIncome = sum(totalIncome),
            danmakusCount = sum(danmakusCount),
            superchatIncome = sum(superchatIncome),
            membershipIncome = sum(membershipIncome)) %>%
  plot(main = "sum per weekday")
Saturday is the best day and Sunday is the second. I guess many viewers work until Saturday evening.
# Pairwise correlations of the numeric per-stream metrics, drawn as a heat
# map; pheatmap also clusters the variables hierarchically.
metric_cols <- c("danmakusCount", "interactionCount", "timeDuration",
                 "totalIncome", "superchatCount", "superchatIncome",
                 "membershipCount", "membershipIncome", "followers")
pheatmap(cor(df_tf[, metric_cols]))
Every pair of variables is positively correlated. Hierarchical clustering (hclust) shows three clusters: {followers, timeDuration}; {totalIncome, membershipCount, membershipIncome}; and {superchatCount, superchatIncome, danmakusCount, interactionCount}.
# Per-liver averages: log mean income vs log mean watch count; point size =
# followers, colour = affiliation, with one overall linear fit.
ggplot(df_tf%>%group_by(liver)%>%summarize(mean.totalIncome=mean(totalIncome),mean.watchCount=mean(watchCount),followers=mean(followers), affiliation=affiliation[1], mean.danmakusCount=mean(danmakusCount), mean.superchatCount=mean(superchatCount),mean.membershipCount=mean(membershipCount), mean.superchatIncome=mean(superchatIncome), mean.membershipIncome=mean(membershipIncome) ), aes(x=log(mean.watchCount), y=log(mean.totalIncome), size = followers, color = affiliation)) +
geom_point(alpha=0.6)+
# Chinese glyphs require a CJK font ("Songti SC" registered via extrafont).
geom_text(aes(label = liver),hjust=0, vjust=0,family="Songti SC")+
theme_bw(base_family = "Songti SC")+
scale_colour_brewer(palette="Set1")+
# aes(color = NULL) removes the colour mapping so a single fit is drawn.
geom_smooth(method=lm,aes(color = NULL))+
guides(size = guide_legend(override.aes = list(linetype = 0)),
color = guide_legend(override.aes = list(linetype = 0)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Linear regression performs well
# Weekday-level averages: one point per weekday, sized by mean membership
# count and coloured by mean superchat count, with a linear fit.
ggplot(df_tf%>%group_by(weekday)%>%summarize(mean.totalIncome=mean(totalIncome),mean.watchCount=mean(watchCount),followers=mean(followers), mean.danmakusCount=mean(danmakusCount), mean.superchatCount=mean(superchatCount),mean.membershipCount=mean(membershipCount), mean.superchatIncome=mean(superchatIncome), mean.membershipIncome=mean(membershipIncome) ), aes(x=log(mean.watchCount), y=log(mean.totalIncome), size = mean.membershipCount, color = mean.superchatCount)) +
geom_point(alpha=0.6)+
geom_smooth(method=lm,aes(color = NULL))+
geom_text(aes(label = weekday),hjust=0, vjust=0,family="Songti SC")+
# NOTE(review): no fill aesthetic is mapped above, so scale_fill_viridis_c()
# has no visible effect here — was scale_colour_viridis_c() intended?
scale_fill_viridis_c()+
guides(size = guide_legend(override.aes = list(linetype = 0)),
color = guide_legend(override.aes = list(linetype = 0)))
## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Hour-of-day profiles of superchat/membership events and of stream
# starts/stops/online livers.
# NOTE(review): time_vec, superchat_num, membership_num, start_num, stop_num
# and online_num are not defined in this file as shown — presumably computed
# in an earlier chunk; verify before re-running.
ggplot(melt(data.frame(time=time_vec,
superchat=superchat_num,membership=membership_num,
start=start_num,stop=stop_num,liver=online_num),id="time"),
aes(x = time,y = value) )+
geom_path()+
facet_wrap(~variable,scales = "free_y",ncol=2)+
scale_x_continuous(breaks = seq(0, 24, by = 2))
There are peaks at the hours and the really high peak at
0:00. In addition, membershipCount keeps high
in 18:00-22:00 and superchatCount keeps high
in 20:00-24:00.
Most livers start on the hour, e.g. at 20:00, which leads to peaks of membershipCount and superchatCount. The peak at 0:00 is possibly due to the symbolic meaning of a new day's beginning and to Bilibili's automatic membership renewal.
The linear regression log(totalIncome)~log(membershipCount) seems to work well. However, we still have factor variables like ip, country, liver and weekday, which can be exploited with tree methods and, furthermore, random forests. Neural networks could perhaps do a good job, but we would likely lose all interpretability.
Bayesian hierarchical regression can also be used with two layers, first for liver and the second for live streams.
Time-series techniques such as ARIMA and VAR models can be used to model the time-varying data.
# Flag the numeric columns to keep for PCA/clustering. Exclude by NAME the
# raw epoch timestamps and the liver id (was positional: c(4, 5, 22)), which
# are numeric but are not stream metrics.
col_num <- unlist(lapply(df_tf, is.numeric))
col_num[c("startDate", "stopDate", "id")] <- FALSE
col_num
## liver area danmakusCount
## FALSE FALSE TRUE
## startDate stopDate timeDuration
## FALSE FALSE TRUE
## title totalIncome watchCount
## FALSE TRUE TRUE
## interactionCount superchatCount superchatIncome
## TRUE TRUE TRUE
## superchatTimestamps membershipCount membershipIncome
## FALSE TRUE TRUE
## membershipTimestamps followers posix_startDate
## FALSE TRUE FALSE
## posix_stopDate weekday affiliation
## FALSE FALSE FALSE
## id sex country
## FALSE FALSE FALSE
## ip
## FALSE
# PCA of the log-transformed metrics; +1 keeps log finite at zero counts.
# NOTE(review): prcomp is used with its default scale. = FALSE — the log
# transform puts the variables on roughly comparable scales; confirm this
# is intentional.
pca<-prcomp(log(df_tf[,col_num]+1))
pca
## Standard deviations (1, .., p=10):
## [1] 4.3859910 1.9775653 1.3326655 0.7648005 0.6608613 0.5496997 0.4207956
## [8] 0.3555255 0.3222402 0.1769085
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4 PC5
## danmakusCount 0.24060079 0.23069642 -0.28830970 -0.35968744 0.415100484
## timeDuration 0.03480811 0.02258354 -0.04564358 -0.15559743 0.139998427
## totalIncome 0.38407619 -0.01420061 0.08108918 -0.60589014 -0.671482260
## watchCount 0.20663207 0.23424574 -0.52543785 0.01450248 0.003667899
## interactionCount 0.21223524 0.21212031 -0.38890992 -0.05969538 0.138928938
## superchatCount 0.28047934 0.26573138 0.14920323 0.10043430 0.204527613
## superchatIncome 0.47751485 0.47101377 0.53594764 0.28632304 0.008519136
## membershipCount 0.22350211 -0.18175075 -0.01561185 -0.05525921 -0.013342268
## membershipIncome 0.58442558 -0.70930140 -0.04772469 0.22633560 0.185782466
## followers 0.08630751 0.13263650 -0.41092580 0.57374938 -0.511166938
## PC6 PC7 PC8 PC9 PC10
## danmakusCount -0.040905103 0.30152039 0.42739172 0.001956260 0.478360942
## timeDuration 0.112771669 -0.22191173 0.09273304 0.910086096 -0.231444465
## totalIncome 0.071029872 0.12883885 -0.07403416 0.014657683 -0.019403354
## watchCount 0.228685323 -0.52529817 -0.47734612 -0.081947390 0.251621895
## interactionCount -0.007682955 0.08147744 0.15444338 -0.262273993 -0.797814175
## superchatCount -0.369691934 0.45174185 -0.63201549 0.187657301 -0.033709345
## superchatIncome 0.194132479 -0.26531312 0.26085571 -0.067037130 0.014632597
## membershipCount -0.824153650 -0.45878201 0.14956725 -0.031844804 0.021060015
## membershipIncome 0.238920895 0.09800830 -0.02366265 0.001740982 0.008118608
## followers -0.132423933 0.25713060 0.24723311 0.235149398 0.124687105
# Cumulative proportion of variance explained. Variance is sdev^2, so the
# standard deviations must be squared; plotting cumsum(sdev)/sum(sdev) (as
# before) understates the share captured by the leading components.
plot(cumsum(pca$sdev^2)/sum(pca$sdev^2))
# Biplots of the first three principal-component pairs.
biplot(pca)
biplot(pca,c(1,3))
biplot(pca,2:3)
# Floor every entry of a numeric data frame at 0.5 so a subsequent log() is
# finite for zero counts.
#
# Vectorized with pmax per column, replacing the original element-wise double
# loop (O(n*p) R-level iterations). pmax also promotes integer columns to
# double, exactly as the loop's max() assignments did.
#
# @param dt data frame with numeric columns only.
# @return the same data frame with all values < 0.5 replaced by 0.5.
proc0 <- function(dt) {
  dt[] <- lapply(dt, pmax, 0.5)
  dt
}
# k-means with k = 3 on the same log-scale metrics used for PCA (proc0 floors
# values at 0.5 before the log). NOTE(review): there is no set.seed()
# immediately before kmeans, so the partition and label order may differ
# between runs.
clust<-kmeans(as.matrix(log(proc0(df_tf[,col_num])) ),3)
clust
## K-means clustering with 3 clusters of sizes 904, 1098, 456
##
## Cluster means:
## danmakusCount timeDuration totalIncome watchCount interactionCount
## 1 7.961558 0.9322399 6.704376 8.733507 6.470306
## 2 9.593734 1.1970757 8.612388 10.182666 7.929021
## 3 7.494122 0.7317177 4.886031 8.378117 6.059340
## superchatCount superchatIncome membershipCount membershipIncome followers
## 1 1.0970877 4.225044 0.7665128 5.8091397 12.05215
## 2 3.4345804 7.249137 2.2128304 7.5777277 12.78910
## 3 0.6306728 3.046705 -0.6931472 -0.6931472 12.04678
##
## Clustering vector:
## [1] 3 2 1 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 1 3 2 1 2 1 3 1 1 1 3 2 1 1 2 3
## [38] 1 2 2 3 1 3 1 1 2 1 3 1 2 3 3 1 3 3 3 1 1 3 1 1 3 1 2 3 3 1 3 3 1 3 2 1 3
## [75] 1 3 2 1 3 1 1 3 1 1 1 1 1 3 3 1 3 3 1 3 1 1 3 3 1 1 2 2 2 2 2 2 2 2 2 2 2
## [112] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 1 2 2 1 2 1 2 2 3 2 2 3 2 2 1 2 2 1 2 2 2 2 2 1 2 1 2 1 3 2 2 1 2 1 3
## [186] 2 2 3 2 2 2 2 2 1 2 1 3 2 2 2 2 1 3 3 3 2 1 2 1 1 2 2 2 2 1 2 2 1 1 2 3 2
## [223] 2 3 2 2 3 2 1 1 1 1 1 2 1 1 1 3 1 3 2 1 1 1 1 1 2 1 1 3 1 3 3 3 1 3 1 1 1
## [260] 1 1 2 3 1 1 1 1 3 1 1 1 3 1 3 1 3 3 3 3 1 1 1 1 3 1 3 3 1 1 3 1 3 1 1 1 3
## [297] 1 1 1 1 3 1 1 3 2 1 3 3 1 3 1 2 1 3 1 3 1 1 1 2 2 3 1 1 2 1 2 3 2 1 2 2 1
## [334] 1 1 2 2 1 2 2 3 1 3 2 2 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## [371] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 3 3 1 3 1
## [408] 1 1 2 1 3 1 1 1 1 1 1 2 1 1 3 3 3 2 3 1 3 1 1 1 3 1 1 3 1 3 3 1 1 1 1 2 2
## [445] 3 1 3 1 3 1 1 1 3 3 1 1 1 1 1 1 1 1 2 2 2 1 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3
## [482] 1 1 2 1 1 1 2 3 1 1 1 1 1 3 3 2 1 3 1 3 3 3 3 3 1 3 1 3 1 1 3 2 2 3 3 1 3
## [519] 3 3 2 1 3 1 1 1 1 1 3 1 3 1 3 1 1 3 2 3 3 1 1 3 3 1 3 2 1 2 1 1 3 3 1 3 1
## [556] 3 2 2 2 3 3 1 3 2 3 3 1 2 1 2 2 2 1 2 1 2 1 3 2 3 1 3 2 2 1 1 2 1 2 2 2 2
## [593] 1 3 2 3 3 1 1 1 1 2 1 1 3 1 2 1 1 1 2 1 1 3 1 2 2 3 2 1 1 1 1 1 2 1 1 1 3
## [630] 3 2 3 1 3 1 1 1 1 3 2 1 1 2 1 1 1 1 1 3 1 3 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 1 1 1 3 2 1 2 2 2 2 2 3
## [704] 1 1 2 2 1 3 2 2 2 2 2 2 1 1 1 3 1 3 3 2 3 3 3 2 2 1 1 1 2 2 2 1 1 1 1 2 1
## [741] 2 1 1 1 1 3 1 3 1 3 1 2 2 3 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 1 2
## [778] 2 3 1 1 2 2 1 2 1 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 2 1 2 1 1 1 3 1 1
## [815] 1 3 2 3 3 1 3 3 3 3 3 3 3 3 1 2 2 1 2 1 1 1 3 1 2 2 2 2 2 2 1 3 2 1 1 2 2
## [852] 2 2 3 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 3 2 1 2 2 2 2 2 2 3 1 2 2 2 2 2 1 2 2
## [889] 2 2 2 1 2 2 2 1 2 3 2 2 2 2 2 1 3 3 2 2 3 3 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1
## [926] 3 3 3 1 2 3 1 2 1 1 3 1 1 2 3 1 1 1 2 1 1 1 1 3 3 3 3 1 3 1 1 3 2 1 1 3 3
## [963] 1 1 1 3 3 1 1 1 1 1 1 1 1 3 1 3 1 2 1 1 1 1 1 1 1 1 3 1 1 1 2 1 2 1 3 3 3
## [1000] 1 1 1 1 3 3 2 1 3 1 1 2 2 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 3 2 2
## [1037] 2 2 2 2 2 3 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1074] 2 2 2 2 1 1 2 2 2 1 1 2 1 2 2 2 2 1 2 3 1 1 2 2 2 2 2 2 2 2 3 3 2 2 1 2 2
## [1111] 3 2 2 2 2 2 3 1 2 2 2 1 2 2 3 3 2 1 1 1 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1148] 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 2 2 2 2
## [1185] 1 1 1 1 3 2 2 1 1 1 1 1 1 1 1 2 1 1 1 3 3 3 1 1 1 1 1 1 1 1 3 3 1 3 1 3 2
## [1222] 2 1 1 1 3 3 1 3 3 3 3 1 3 2 3 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 2 2 3 1
## [1259] 2 1 3 3 1 1 2 1 1 1 1 1 2 1 3 1 3 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2 2 1
## [1296] 2 2 2 2 3 1 2 2 1 1 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 3 2 2 1 1 2 1 2 2 1 2 1
## [1333] 1 1 3 1 1 3 1 3 1 2 1 3 1 3 1 1 1 1 1 3 3 1 1 1 1 3 3 1 1 2 2 1 1 3 1 3 3
## [1370] 1 1 3 1 1 1 1 3 1 3 3 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2
## [1407] 1 2 2 2 2 3 2 1 2 3 2 1 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2
## [1444] 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 3 2 1 2 3 2
## [1481] 3 2 1 1 1 1 1 2 1 1 2 3 3 1 1 2 1 1 3 3 3 2 2 1 1 1 1 2 1 1 3 3 1 1 3 1 3
## [1518] 3 1 3 1 1 2 1 3 3 1 3 1 1 2 1 3 1 1 1 1 3 1 1 3 3 1 1 3 1 3 1 1 3 1 1 1 1
## [1555] 1 1 1 1 3 1 1 3 1 2 2 1 1 2 1 3 1 3 1 3 1 1 1 3 1 3 1 3 1 2 2 2 1 2 2 2 1
## [1592] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 3 1 3 2 2 1 1 2 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2
## [1629] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1666] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 3 1 1 2 1 1 1 1 3 2 1 3 1 1 1
## [1703] 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 3 1 1 1 1 3 3 1 1 1 1 2 2 1 2 2 2 2 1 1
## [1740] 2 1 1 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 1 2 1 3 1 2 1 2 2 2 3 3 1 3 2 3 2
## [1777] 2 1 1 3 3 1 3 3 3 3 3 1 1 3 1 1 3 3 3 3 1 1 3 1 1 3 1 3 3 1 3 3 3 1 3 3 3
## [1814] 3 1 3 1 3 3 1 1 3 3 1 3 3 1 1 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1851] 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2 2 1 1 1 1 1 1 1 1 3 1 1 3 1 1 3 1
## [1888] 2 1 3 1 1 3 3 3 3 1 3 1 3 3 1 3 3 1 1 3 1 3 1 1 1 1 3 1 1 1 3 3 3 1 3 1 2
## [1925] 1 2 1 1 3 3 1 1 1 2 1 2 1 1 2 2 2 2 3 2 3 3 3 1 1 1 2 2 3 1 2 2 1 2 2 3 2
## [1962] 1 2 1 1 3 1 2 1 2 3 2 3 1 1 2 1 2 3 1 1 2 1 2 1 1 1 1 2 3 2 1 1 2 1 2 1 1
## [1999] 1 2 1 2 1 2 1 3 2 1 1 2 1 2 2 1 3 3 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2
## [2036] 1 2 1 1 3 3 3 1 3 3 3 1 3 3 1 3 1 1 3 3 3 3 2 1 1 1 3 3 1 3 3 3 3 3 3 3 3
## [2073] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 3 1 1 1 1 1 1 3 1 1 3 1 1 3 1 3 1 2 1 1 1
## [2110] 1 1 1 2 3 1 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 2 2 1 1 2 1 1 1 1 3 1 1 1 3
## [2147] 3 1 1 1 1 3 1 3 3 1 1 2 1 2 2 3 1 1 1 2 2 2 2 2 2 3 2 2 2 2 2 3 1 2 2 2 2
## [2184] 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2
## [2221] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2
## [2258] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 1 1 2 2 3 2 2 2 2 2 2 2 2 2
## [2295] 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3
## [2332] 2 2 2 2 3 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
## [2369] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [2406] 2 3 2 2 2 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 1 1 2 2 2 2 2 1 1 1 1 1 1 2
## [2443] 1 3 1 3 1 2 3 3 1 3 3 3 2 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 9313.457 10191.177 9045.927
## (between_SS / total_SS = 62.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Streams with zero new memberships: 456, exactly the size of cluster 3,
# whose log membership centers sit at log(0.5) = -0.693.
sum(df_tf$membershipCount==0)
## [1] 456
# Project the streams onto the leading principal components, coloured by
# k-means cluster.
plot(pca$x[,1:2] , col=clust$cluster)
plot(pca$x[,c(1,3)] , col=clust$cluster)
plot(pca$x[,2:3] , col=clust$cluster)
# Weekday composition of cluster 1.
table(df_tf[clust$cluster==1,"weekday"])
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 112 127 130 125 147 137 126
# Weekday composition of cluster 2 (the high-income cluster); it skews
# toward Saturday and Sunday.
table(df_tf[clust$cluster==2,"weekday"])
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 111 119 143 153 152 226 194
# Weekday composition of cluster 3 (zero-membership streams); roughly even.
table(df_tf[clust$cluster==3,"weekday"])
##
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 58 69 70 68 65 58 68
# Affiliation composition of cluster 1.
(table(df_tf[clust$cluster==1,"affiliation"]))
##
## another-project chaoslive independent meta-mythos providence
## 39 10 231 27 56
## psp vr 虚研社
## 92 400 49
# Affiliation composition of cluster 2; independent livers dominate.
(table(df_tf[clust$cluster==2,"affiliation"]))
##
## another-project chaoslive chucolala independent meta-mythos
## 22 31 50 492 16
## providence psp vr 虚研社
## 18 117 327 25
# Affiliation composition of cluster 3.
(table(df_tf[clust$cluster==3,"affiliation"]))
##
## another-project chaoslive independent meta-mythos providence
## 39 9 117 7 26
## psp vr 虚研社
## 37 196 25
# Build the modelling frame: factor-encode the categoricals and drop
# free-text, raw-timestamp and identifier columns that should not enter the
# models.
df_rf <- df_tf
df_rf$liver <- factor(df_rf$liver)
df_rf$area <- factor(df_rf$area)
df_rf$affiliation <- factor(df_rf$affiliation)
df_rf$title <- NULL
df_rf$startDate <- NULL
df_rf$stopDate <- NULL
df_rf$superchatTimestamps <- NULL
df_rf$membershipTimestamps <- NULL
df_rf$posix_startDate <- NULL
df_rf$posix_stopDate <- NULL
df_rf$id <- NULL
# Reproducible 80/20 train/test split.
set.seed(123456)
N <- nrow(df_rf)
ind_train <- sample(seq_len(N), ceiling(N * 0.8))
ind_test <- seq_len(N)[-ind_train]
# Select the model columns by name instead of by position (was c(3:8,10,12))
# so this does not silently break if the column layout changes.
model_cols <- c("danmakusCount", "timeDuration", "totalIncome", "watchCount",
                "interactionCount", "superchatCount", "membershipCount",
                "followers")
train <- df_rf[ind_train, model_cols]
test <- df_rf[ind_test, model_cols]
summary(train)
## danmakusCount timeDuration totalIncome watchCount
## Min. : 0 Min. : 0.00233 Min. : 0.0 Min. : 0
## 1st Qu.: 2188 1st Qu.: 2.13029 1st Qu.: 543.5 1st Qu.: 5012
## Median : 5125 Median : 2.79710 Median : 1556.0 Median : 10596
## Mean : 11815 Mean : 3.16621 Mean : 5531.4 Mean : 21457
## 3rd Qu.: 11694 3rd Qu.: 3.94296 3rd Qu.: 4384.3 3rd Qu.: 27057
## Max. :192441 Max. :19.47241 Max. :450768.7 Max. :204768
## interactionCount superchatCount membershipCount followers
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 34583
## 1st Qu.: 495.5 1st Qu.: 2.00 1st Qu.: 1.00 1st Qu.: 115577
## Median : 1082.0 Median : 9.00 Median : 3.00 Median : 263239
## Mean : 2117.1 Mean : 24.53 Mean : 13.32 Mean : 353845
## 3rd Qu.: 2602.0 3rd Qu.: 27.00 3rd Qu.: 8.00 3rd Qu.: 486642
## Max. :24202.0 Max. :1102.00 Max. :2173.00 Max. :1339125
# Floor every metric at 0.1 so the log transforms below are finite for zero
# counts. Vectorized pmax per column replaces the original element-wise
# double loops; the values are identical, and pmax promotes integer columns
# to double exactly as the loop's max() assignments did.
train[] <- lapply(train, pmax, 0.1)
test[] <- lapply(test, pmax, 0.1)
# randomForest is already attached at the top of the file; this second
# library() call is redundant but harmless.
library(randomForest)
set.seed(123456)
# Random forest regression of log income on the log-scale predictors.
rf <- randomForest(totalIncome ~ ., data=log(train),ntree=100,
importance=TRUE, na.action=na.omit)
print(rf)
##
## Call:
## randomForest(formula = totalIncome ~ ., data = log(train), ntree = 100, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 0.5876366
## % Var explained: 84.16
## Show "importance" of variables: higher value mean more important:
# Permutation importance (%IncMSE) and node-purity gain for each predictor.
round(importance(rf), 2)
## %IncMSE IncNodePurity
## danmakusCount 12.94 1226.67
## timeDuration 6.03 374.10
## watchCount 10.86 774.90
## interactionCount 11.39 851.80
## superchatCount 21.05 1716.66
## membershipCount 33.39 1956.38
## followers 15.63 238.00
# Baseline: ordinary least squares on the same log-log specification.
lr<-lm(totalIncome ~ ., data=log(train))
summary(lr)
##
## Call:
## lm(formula = totalIncome ~ ., data = log(train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9761 -0.4902 -0.1065 0.4340 4.3495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.06468 0.40714 12.440 < 2e-16 ***
## danmakusCount 0.15097 0.06343 2.380 0.0174 *
## timeDuration 0.20044 0.04165 4.813 1.60e-06 ***
## watchCount 0.05507 0.04231 1.302 0.1932
## interactionCount 0.16599 0.08943 1.856 0.0636 .
## superchatCount 0.31034 0.01707 18.182 < 2e-16 ***
## membershipCount 0.46378 0.01443 32.136 < 2e-16 ***
## followers -0.16234 0.03541 -4.584 4.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9092 on 1959 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7773
## F-statistic: 981.2 on 7 and 1959 DF, p-value: < 2.2e-16
# OLS diagnostic plots.
plot(lr)
# Back-transform predictions to the CNY scale.
# NOTE(review): exp() of a predicted log is not an unbiased estimate of the
# conditional mean under log-scale errors — acceptable for comparison here,
# but confirm if absolute accuracy matters.
lr_fit=exp(predict(lr,log(test)))
lr_resid=(test$totalIncome)-(lr_fit)
summary(lr_resid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -48501.02 -723.15 -17.41 762.87 430.99 93792.73
# Test-set RMSE and MAE for the linear model.
sqrt(mean(lr_resid^2))
## [1] 7311.825
mean(abs(lr_resid))
## [1] 2441.767
# NOTE(review): max() here is a scalar (the largest test income), so this is
# MAE relative to the maximum income; if a per-stream relative error was
# intended, pmax(test$totalIncome, 1) would be needed instead.
mean(abs(lr_resid)/max(test$totalIncome,1))
## [1] 0.008158422
# Same evaluation for the random forest.
rf_fit=exp(predict(rf,log(test)))
rf_resid=test$totalIncome - rf_fit
summary( rf_resid )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -26669.39 -436.09 -84.67 1600.86 403.42 165570.88
sqrt(mean( rf_resid^2) )
## [1] 10634.32
mean( abs( rf_resid ) )
## [1] 2460.008
# The largest residual comes from a single extreme high-income stream.
test[which.max(abs(rf_resid)),]
## danmakusCount timeDuration totalIncome watchCount interactionCount
## 2227 210878 4.298397 299294 112758 25175
## superchatCount membershipCount followers
## 2227 1724 1006 1339125
mean(abs(rf_resid)/max(test$totalIncome,1))
## [1] 0.008219368
# Variable importance and out-of-bag error versus number of trees.
varImpPlot(rf)
plot(rf)
# Floor every entry of a numeric data frame at 0.5 so a subsequent log() is
# finite for zero counts. (This re-definition is identical to the proc0
# defined earlier in the file; kept so the chunk remains self-contained.)
#
# Vectorized with pmax per column, replacing the original element-wise double
# loop; pmax also promotes integer columns to double, as the loop's max()
# assignments did.
#
# @param dt data frame with numeric columns only.
# @return the same data frame with all values < 0.5 replaced by 0.5.
proc0 <- function(dt) {
  dt[] <- lapply(dt, pmax, 0.5)
  dt
}
# Causal-analysis frame: same eight metrics as the train/test split above.
df_causal<-df_rf[,c(3:8,10,12)]
# Floor at 0.5, log-transform, then mean-center each column
# (t(t(x) - colMeans(x)) subtracts the column means).
df_causal<-proc0(df_causal)
df_causal<-log(df_causal)
df_causal<-as.data.frame(t(t(df_causal)-colMeans(df_causal)) )
# W1: treatment indicator for independent (non-agency) livers.
df_causal$W1=0
df_causal[df_rf$affiliation=="independent","W1"]=1
# W2: treatment indicator for weekend (Saturday/Sunday) streams.
df_causal$W2=0
df_causal[(df_rf$weekday=="Sunday")|(df_rf$weekday=="Saturday"),"W2"]=1
# Keep streams earning at least 1 CNY, then re-center after the filter.
df_causal<-df_causal[df_rf$totalIncome>=1,]
df_causal[,1:8]<-as.data.frame(t(t(df_causal[,1:8])-colMeans(df_causal[,1:8])) )
# Covariate-only regression (both treatments excluded).
lr<-lm(totalIncome~.-W1-W2,df_causal)
summary(lr)
##
## Call:
## lm(formula = totalIncome ~ . - W1 - W2, data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5720 -0.3943 -0.0898 0.3380 3.7391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.088e-15 1.640e-02 0.000 1.0000
## danmakusCount 1.082e-01 5.177e-02 2.090 0.0368 *
## timeDuration 8.846e-02 3.754e-02 2.356 0.0185 *
## watchCount 2.797e-02 3.970e-02 0.705 0.4811
## interactionCount -2.532e-02 7.501e-02 -0.338 0.7357
## superchatCount 4.250e-01 1.892e-02 22.466 <2e-16 ***
## membershipCount 6.230e-01 1.579e-02 39.469 <2e-16 ***
## followers -2.338e-02 3.093e-02 -0.756 0.4498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.81 on 2432 degrees of freedom
## Multiple R-squared: 0.7793, Adjusted R-squared: 0.7787
## F-statistic: 1227 on 7 and 2432 DF, p-value: < 2.2e-16
# Diagnostics for the covariate-only model.
plot(lr)
# Naive W1 (independent liver) effect: unadjusted difference in mean log
# income.
lr0<-lm(totalIncome~W1,df_causal)
summary(lr0)
##
## Call:
## lm(formula = totalIncome ~ W1, data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7086 -0.9311 0.0667 1.0746 5.8536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22415 0.04228 -5.302 1.25e-07 ***
## W1 0.65499 0.07227 9.063 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.694 on 2438 degrees of freedom
## Multiple R-squared: 0.03259, Adjusted R-squared: 0.03219
## F-statistic: 82.13 on 1 and 2438 DF, p-value: < 2.2e-16
# The same naive W1 comparison as a Welch two-sample t-test.
with(df_causal, t.test( (totalIncome[W1==1]), (totalIncome[W1==0]) ) )
##
## Welch Two Sample t-test
##
## data: (totalIncome[W1 == 1]) and (totalIncome[W1 == 0])
## t = 9.356, df = 1845.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.5176906 0.7922950
## sample estimates:
## mean of x mean of y
## 0.4308457 -0.2241471
# W1 effect adjusted for the centered covariates (W2 excluded).
lr1<-lm(totalIncome~(.-W2),df_causal)
summary(lr1)
##
## Call:
## lm(formula = totalIncome ~ (. - W2), data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5519 -0.3914 -0.0925 0.3515 3.8068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0503582 0.0207045 -2.432 0.01508 *
## danmakusCount 0.1321863 0.0519717 2.543 0.01104 *
## timeDuration 0.1199312 0.0382602 3.135 0.00174 **
## watchCount 0.0126697 0.0397688 0.319 0.75007
## interactionCount -0.0666659 0.0755034 -0.883 0.37735
## superchatCount 0.4227338 0.0188677 22.405 < 2e-16 ***
## membershipCount 0.6240051 0.0157398 39.645 < 2e-16 ***
## followers -0.0003863 0.0313809 -0.012 0.99018
## W1 0.1471545 0.0371261 3.964 7.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8075 on 2431 degrees of freedom
## Multiple R-squared: 0.7807, Adjusted R-squared: 0.78
## F-statistic: 1082 on 8 and 2431 DF, p-value: < 2.2e-16
# Diagnostics for the adjusted W1 model.
plot(lr1)
# Let the W1 effect vary with the covariates (full interaction model).
lr2<-lm(totalIncome~(.-W2)+W1*(.-W2),df_causal)
summary(lr2)
##
## Call:
## lm(formula = totalIncome ~ (. - W2) + W1 * (. - W2), data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5868 -0.3897 -0.0630 0.3532 3.7454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0296278 0.0206699 -1.433 0.15188
## danmakusCount 0.1022086 0.0614442 1.663 0.09635 .
## timeDuration 0.0111641 0.0465597 0.240 0.81052
## watchCount -0.0009463 0.0602050 -0.016 0.98746
## interactionCount 0.0052127 0.0975093 0.053 0.95737
## superchatCount 0.4539843 0.0224526 20.220 < 2e-16 ***
## membershipCount 0.6297061 0.0191011 32.967 < 2e-16 ***
## followers 0.0285459 0.0418848 0.682 0.49560
## W1 0.2258040 0.0378903 5.959 2.90e-09 ***
## danmakusCount:W1 0.0780596 0.1127683 0.692 0.48887
## timeDuration:W1 0.3491868 0.0820659 4.255 2.17e-05 ***
## watchCount:W1 0.0247559 0.0797540 0.310 0.75628
## interactionCount:W1 -0.2190745 0.1603594 -1.366 0.17202
## superchatCount:W1 -0.1059212 0.0405412 -2.613 0.00904 **
## membershipCount:W1 0.0018722 0.0332638 0.056 0.95512
## followers:W1 -0.1248989 0.0648343 -1.926 0.05417 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.794 on 2424 degrees of freedom
## Multiple R-squared: 0.7886, Adjusted R-squared: 0.7873
## F-statistic: 602.9 on 15 and 2424 DF, p-value: < 2.2e-16
# Diagnostics for the W1 interaction model.
plot(lr2)
# Naive W2 (weekend) effect: unadjusted difference in mean log income.
lr0<-lm(totalIncome~W2,df_causal)
summary(lr0)
##
## Call:
## lm(formula = totalIncome ~ W2, data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5281 -0.9537 0.0730 1.1194 5.7529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.12350 0.04238 -2.914 0.0036 **
## W2 0.37388 0.07374 5.070 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.713 on 2438 degrees of freedom
## Multiple R-squared: 0.01044, Adjusted R-squared: 0.01003
## F-statistic: 25.71 on 1 and 2438 DF, p-value: 4.268e-07
# The same naive weekend comparison as a Welch two-sample t-test.
with(df_causal, t.test( (totalIncome[W2==1]), (totalIncome[W2==0]) ) )
##
## Welch Two Sample t-test
##
## data: (totalIncome[W2 == 1]) and (totalIncome[W2 == 0])
## t = 5.0329, df = 1571.9, p-value = 5.386e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2281671 0.5195881
## sample estimates:
## mean of x mean of y
## 0.2503754 -0.1235022
# Covariate-adjusted model for W2: regress income on every column of
# df_causal except the other treatment W1. After adjustment the W2
# coefficient shrinks from 0.374 (lr0) to 0.052 and is no longer
# significant (p = 0.137), suggesting the naive effect was confounded.
lr1<-lm(totalIncome~(.-W1),df_causal)
summary(lr1)
##
## Call:
## lm(formula = totalIncome ~ (. - W1), data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5597 -0.3960 -0.0902 0.3331 3.7569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01731 0.02011 -0.861 0.3894
## danmakusCount 0.11002 0.05178 2.125 0.0337 *
## timeDuration 0.08530 0.03759 2.269 0.0233 *
## watchCount 0.02786 0.03969 0.702 0.4828
## interactionCount -0.02264 0.07501 -0.302 0.7628
## superchatCount 0.42241 0.01899 22.244 <2e-16 ***
## membershipCount 0.62154 0.01581 39.303 <2e-16 ***
## followers -0.02392 0.03093 -0.774 0.4393
## W2 0.05240 0.03524 1.487 0.1372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8098 on 2431 degrees of freedom
## Multiple R-squared: 0.7795, Adjusted R-squared: 0.7788
## F-statistic: 1074 on 8 and 2431 DF, p-value: < 2.2e-16
# W2 model with full treatment-by-covariate interactions, to probe for
# effect heterogeneity. None of the W2 interaction terms reach
# significance in the output below.
lr2<-lm(totalIncome~(.-W1)+W2*(.-W1),df_causal)
summary(lr2)
##
## Call:
## lm(formula = totalIncome ~ (. - W1) + W2 * (. - W1), data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5367 -0.3932 -0.0862 0.3410 3.7479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01926 0.02015 -0.956 0.3392
## danmakusCount 0.06108 0.06374 0.958 0.3380
## timeDuration 0.10828 0.04736 2.286 0.0223 *
## watchCount 0.03763 0.04950 0.760 0.4472
## interactionCount 0.05618 0.09170 0.613 0.5402
## superchatCount 0.40472 0.02373 17.055 <2e-16 ***
## membershipCount 0.61288 0.01972 31.081 <2e-16 ***
## followers -0.04956 0.03766 -1.316 0.1883
## W2 0.04798 0.03538 1.356 0.1751
## danmakusCount:W2 0.15967 0.10990 1.453 0.1464
## timeDuration:W2 -0.06040 0.07794 -0.775 0.4384
## watchCount:W2 -0.02118 0.08303 -0.255 0.7986
## interactionCount:W2 -0.24872 0.15991 -1.555 0.1200
## superchatCount:W2 0.04424 0.03979 1.112 0.2663
## membershipCount:W2 0.02200 0.03315 0.663 0.5071
## followers:W2 0.07782 0.06619 1.176 0.2398
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8097 on 2424 degrees of freedom
## Multiple R-squared: 0.7802, Adjusted R-squared: 0.7789
## F-statistic: 573.7 on 15 and 2424 DF, p-value: < 2.2e-16
# Sanity check: OLS residuals are orthogonal to any included regressor by
# construction, so this correlation is numerically zero.
cor(lr2$residuals,df_causal$W2)
## [1] -2.776904e-16
# Income not attributable to memberships or superchats.
df_rf$otherIncome<-with(df_rf,totalIncome-membershipIncome-superchatIncome)
# NOTE(review): max(df_rf$totalIncome, 1) is a single scalar (the
# column-wide maximum, floored at 1), so every row is divided by the same
# denominator. If a per-stream share of income was intended, this should
# be pmax(df_rf$totalIncome, 1) — TODO confirm intent. The summary below
# (tiny, uniformly scaled values) is consistent with the scalar max as
# written: "other" income is a negligible share throughout.
hist(df_rf$otherIncome/max(df_rf$totalIncome,1))
summary(df_rf$otherIncome/max(df_rf$totalIncome,1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0002351 0.0007771 0.0025227 0.0023349 0.1172686
# Causal forest (grf) for treatment W1 with explicitly pre-estimated
# nuisance functions (orthogonalization): W.hat from a propensity
# regression forest, Y.hat from an outcome regression forest.
library(grf)
set.seed(114514)
# Columns 3, 9, 10 of df_causal are totalIncome, W1, W2 (see the
# match.data head printed later in this file), i.e. X keeps only the
# seven live-stream covariates.
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W1"]
forest.W <- regression_forest(X, W, tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X, Y, tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
tau.forest <- causal_forest(X, Y, W,
W.hat = W.hat, Y.hat = Y.hat,
tune.parameters = "all"
)
# Out-of-bag CATE estimates for each observation.
tau.hat <- predict(tau.forest)$predictions
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.38225 -0.03152 0.19582 0.32277 0.75373 2.53658
# Overlap-weighted ATE; TMLE and AIPW give identical point estimates here.
average_treatment_effect(tau.forest, target.sample = "overlap",method="TMLE")
## estimate std.err
## 0.2315368 0.1067300
average_treatment_effect(tau.forest, target.sample = "overlap",method="AIPW")
## estimate std.err
## 0.2315368 0.1067300
# Propensity diagnostics: W.hat ranges from 0.014 to 0.996 (summary
# below), i.e. estimated propensities get very close to 0 and 1 —
# limited overlap for W1.
ggplot(NULL, aes(x=df_causal$totalIncome, y=W.hat)) +
geom_point(aes(color=df_causal$W1))
summary(W.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01375 0.04009 0.07809 0.33909 0.80547 0.99647
ggplot(NULL, aes(W.hat, fill=factor(df_causal$W1) )) +
geom_density(alpha=.5) +
scale_fill_manual(values = c('#999999','#E69F00'))
summary(W.hat-W)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.881539 -0.045613 0.032345 -0.003121 0.055067 0.807624
# Outcome forest: superchatCount (5) and membershipCount (6) dominate
# variable importance for predicting income.
forest.Y
## GRF forest object of type regression_forest
## Number of trees: 2000
## Number of training samples: 2440
## Variable importance:
## 1 2 3 4 5 6 7
## 0.039 0.003 0.006 0.015 0.333 0.600 0.005
# Same causal forest for W1 but letting causal_forest estimate the
# nuisance components internally (no W.hat/Y.hat supplied), as a
# robustness check against the orthogonalized fit above.
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W1"]
tau.forest <- causal_forest(X, Y, W,
tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions
head(predict(tau.forest,estimate.variance = TRUE))
## predictions variance.estimates debiased.error excess.error
## 1 1.737771 0.7248275 0.01183947 0.0002977976
## 2 1.937831 0.3315386 0.18403453 0.0001180892
## 3 2.086950 0.4820520 0.93942636 0.0008962427
## 4 2.044104 0.6409845 0.07063379 0.0001085127
## 5 2.204244 0.2505762 0.49207595 0.0002021150
## 6 1.751580 0.3623757 0.29835226 0.0011565514
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.28502 -0.02547 0.21464 0.33527 0.75112 2.41368
# Overlap-weighted ATE: 0.237 (se 0.105), close to the orthogonalized
# estimate of 0.232.
average_treatment_effect(tau.forest, target.sample = "overlap")
## estimate std.err
## 0.2366983 0.1052452
varimp <- variable_importance(tau.forest)
ranked.vars <- order(varimp, decreasing = TRUE)
# All 7 variables ranked by this importance measure (the original
# comment said "Top 5" but ranked.vars[1:7] lists all seven).
colnames(X)[ranked.vars[1:7]]
## [1] "followers" "timeDuration" "interactionCount" "danmakusCount"
## [5] "watchCount" "superchatCount" "membershipCount"
# NOTE(review): the '#>' lines below are stale example output copied
# from the grf vignette (a savings study), not from this analysis.
#> [1] "financial.autonomy.index" "intention.to.save.index"
#> [3] "family.receives.cash.transfer" "has.computer.with.internet.at.home"
#> [5] "is.female"
# Best linear projection of the W1 CATE on the covariates: a doubly
# robust linear summary of effect heterogeneity. timeDuration,
# superchatCount, and followers load significantly. The warning repeats
# the overlap concern seen in the propensity diagnostics above.
best_linear_projection(tau.forest, X)
## Warning in best_linear_projection(tau.forest, X): Estimated treatment
## propensities take values between 0.011 and 0.998 and in particular get very
## close to 0 or 1.
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.361159 0.020051 18.0120 < 2.2e-16 ***
## danmakusCount 0.002989 0.063475 0.0471 0.9624
## timeDuration 0.420571 0.050931 8.2577 2.407e-16 ***
## watchCount -0.047197 0.061924 -0.7622 0.4460
## interactionCount 0.014608 0.102956 0.1419 0.8872
## superchatCount -0.218872 0.023447 -9.3348 < 2.2e-16 ***
## membershipCount 0.041215 0.019364 2.1285 0.0334 *
## followers -0.281476 0.045170 -6.2315 5.429e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): this second call is an exact duplicate of the one above
# (identical arguments and output) and could be removed.
best_linear_projection(tau.forest, X)
## Warning in best_linear_projection(tau.forest, X): Estimated treatment
## propensities take values between 0.011 and 0.998 and in particular get very
## close to 0 or 1.
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.361159 0.020051 18.0120 < 2.2e-16 ***
## danmakusCount 0.002989 0.063475 0.0471 0.9624
## timeDuration 0.420571 0.050931 8.2577 2.407e-16 ***
## watchCount -0.047197 0.061924 -0.7622 0.4460
## interactionCount 0.014608 0.102956 0.1419 0.8872
## superchatCount -0.218872 0.023447 -9.3348 < 2.2e-16 ***
## membershipCount 0.041215 0.019364 2.1285 0.0334 *
## followers -0.281476 0.045170 -6.2315 5.429e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# See if a causal forest succeeded in capturing heterogeneity by plotting
# the TOC and calculating a 95% CI for the AUTOC (W1 treatment).
# Honest split: fit a CATE model on a random half (train.forest), then
# evaluate the rank-average treatment effect on the held-out half
# (eval.forest), each with its own nuisance estimates.
set.seed(114514)
X <- df_causal[, -c(3, 9, 10)]  # drop totalIncome, W1, W2
Y <- df_causal[, "totalIncome"]
W <- df_causal[, "W1"]
n <- nrow(df_causal)
train <- sample(seq_len(n), n / 2)
forest.W <- regression_forest(X[train, ], W[train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X[train, ], Y[train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
train.forest <- causal_forest(X[train, ], Y[train], W[train],
                              W.hat = W.hat, Y.hat = Y.hat,
                              tune.parameters = "all")
# Nuisance forests re-estimated on the evaluation half only.
forest.W <- regression_forest(X[-train, ], W[-train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X[-train, ], Y[-train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
eval.forest <- causal_forest(X[-train, ], Y[-train], W[-train],
                             W.hat = W.hat, Y.hat = Y.hat,
                             tune.parameters = "all")
# Rank held-out units by the train-forest CATE predictions and compute
# the TOC / AUTOC on the evaluation forest.
rate <- rank_average_treatment_effect(eval.forest,
                                      predict(train.forest, X[-train, ])$predictions)
plot(rate)
# BUG FIX: the CI label previously read "+/" instead of "+/-".
paste("AUTOC:", round(rate$estimate, 2), "+/-", round(1.96 * rate$std.err, 2))
## [1] "AUTOC: 0.33 +/ 0.06"
# (Output above was captured before the label fix; the AUTOC of 0.33 with
# a CI excluding zero indicates the forest detected real heterogeneity.)
# Causal forest for the second treatment W2, with pre-estimated
# nuisance functions (same recipe as the W1 analysis above).
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W2"]
forest.W <- regression_forest(X, W, tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X, Y, tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
tau.forest <- causal_forest(X, Y, W,
W.hat = W.hat, Y.hat = Y.hat,
tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions
head(predict(tau.forest,estimate.variance = TRUE))
## predictions variance.estimates debiased.error excess.error
## 1 -0.024784535 0.02483971 0.06838806 0.0002614504
## 2 -0.007747133 0.01043775 0.30354459 0.0002241894
## 3 -0.042947017 0.02041626 0.02119088 0.0002437538
## 4 -0.029243547 0.03527779 0.17953407 0.0002984242
## 5 0.001676725 0.02149095 0.14924601 0.0003682184
## 6 0.031699562 0.01628176 0.01910503 0.0001907937
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.27713 0.01919 0.07443 0.06096 0.11186 0.22084
# Overlap-weighted ATE for W2 is small (0.063, se 0.030).
average_treatment_effect(tau.forest, target.sample = "overlap")
## estimate std.err
## 0.06347278 0.03007109
# Propensity diagnostics: unlike W1, W2 propensities stay well inside
# (0.19, 0.64) per the summary below — good overlap.
ggplot(NULL, aes(x=df_causal$totalIncome, y=W.hat)) +
geom_point(aes(color=df_causal$W2))
summary(W.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1929 0.2895 0.3202 0.3299 0.3557 0.6390
ggplot(NULL, aes(W.hat, fill=factor(df_causal$W2) )) +
geom_density(alpha=.5) +
scale_fill_manual(values = c('#999999','#E69F00'))
summary(W.hat-W)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.8070922 -0.6077922 0.2855493 -0.0004587 0.3309460 0.6390002
# Outcome forest is the same object as in the W1 section (same X, Y).
forest.Y
## GRF forest object of type regression_forest
## Number of trees: 2000
## Number of training samples: 2440
## Variable importance:
## 1 2 3 4 5 6 7
## 0.039 0.003 0.006 0.015 0.333 0.600 0.005
varimp <- variable_importance(tau.forest)
ranked.vars <- order(varimp, decreasing = TRUE)
# All 7 variables ranked by this importance measure (the original
# comment said "Top 5" but ranked.vars[1:7] lists all seven).
colnames(X)[ranked.vars[1:7]]
## [1] "danmakusCount" "timeDuration" "watchCount" "followers"
## [5] "interactionCount" "superchatCount" "membershipCount"
# NOTE(review): the '#>' lines below are stale example output copied
# from the grf vignette (a savings study), not from this analysis.
#> [1] "financial.autonomy.index" "intention.to.save.index"
#> [3] "family.receives.cash.transfer" "has.computer.with.internet.at.home"
#> [5] "is.female"
# Best linear projection of the W2 CATE on the covariates. No covariate
# loads significantly, and no overlap warning is raised here (consistent
# with the well-behaved W2 propensities above).
best_linear_projection(tau.forest, X)
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.055731 0.030959 1.8002 0.07196 .
## danmakusCount 0.057955 0.112002 0.5174 0.60489
## timeDuration -0.021498 0.090219 -0.2383 0.81167
## watchCount -0.110506 0.140170 -0.7884 0.43056
## interactionCount -0.016864 0.202127 -0.0834 0.93351
## superchatCount 0.060470 0.037018 1.6335 0.10249
## membershipCount 0.016470 0.025834 0.6375 0.52385
## followers 0.052154 0.071730 0.7271 0.46724
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Robustness check for W2: causal forest with internally estimated
# nuisance components (no W.hat/Y.hat supplied).
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W2"]
tau.forest <- causal_forest(X, Y, W,
tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions
head(predict(tau.forest,estimate.variance = TRUE))
## predictions variance.estimates debiased.error excess.error
## 1 -0.0164990427 0.02844612 0.05402773 0.0005632331
## 2 0.0002077488 0.12623932 0.18840197 0.0004624321
## 3 -0.0488933046 0.06579101 0.14049751 0.0006234160
## 4 0.0088083348 0.23757168 0.04202226 0.0010338187
## 5 0.0014174466 0.23620999 0.28105350 0.0015134367
## 6 0.0359870202 0.07364137 0.01752424 0.0004741497
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.426036 0.006774 0.075757 0.064282 0.127471 0.342546
# Overlap-weighted ATE 0.064 (se 0.030), close to the orthogonalized
# estimate of 0.063.
average_treatment_effect(tau.forest, target.sample = "overlap")
## estimate std.err
## 0.06423604 0.03012605
# See if a causal forest succeeded in capturing heterogeneity by plotting
# the TOC and calculating a 95% CI for the AUTOC (W2 treatment).
# Same honest train/evaluate split as the W1 AUTOC analysis.
set.seed(114514)
X <- df_causal[, -c(3, 9, 10)]  # drop totalIncome, W1, W2
Y <- df_causal[, "totalIncome"]
W <- df_causal[, "W2"]
n <- nrow(df_causal)
train <- sample(seq_len(n), n / 2)
forest.W <- regression_forest(X[train, ], W[train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X[train, ], Y[train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
train.forest <- causal_forest(X[train, ], Y[train], W[train],
                              W.hat = W.hat, Y.hat = Y.hat,
                              tune.parameters = "all")
# Nuisance forests re-estimated on the evaluation half only.
forest.W <- regression_forest(X[-train, ], W[-train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions
forest.Y <- regression_forest(X[-train, ], Y[-train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions
eval.forest <- causal_forest(X[-train, ], Y[-train], W[-train],
                             W.hat = W.hat, Y.hat = Y.hat,
                             tune.parameters = "all")
rate <- rank_average_treatment_effect(eval.forest,
                                      predict(train.forest, X[-train, ])$predictions)
plot(rate)
# BUG FIX: the CI label previously read "+/" instead of "+/-".
paste("AUTOC:", round(rate$estimate, 2), "+/-", round(1.96 * rate$std.err, 2))
## [1] "AUTOC: 0.01 +/ 0.09"
# (Output above was captured before the label fix; the AUTOC CI covers
# zero, so no detectable heterogeneity for W2.)
# Alternative identification strategy: full optimal matching on the
# propensity score for W1 (MatchIt), targeting the ATE. Full matching
# keeps all units and assigns weights rather than discarding controls.
library(MatchIt)
m.out <- matchit(W1 ~ danmakusCount+timeDuration+watchCount+interactionCount+superchatCount+membershipCount+followers, data = df_causal,
method = "full", estimand = "ATE")
# Balance check: standardized mean differences drop from up to 0.88
# (distance) pre-matching to <= 0.08 post-matching.
summary(m.out)
##
## Call:
## matchit(formula = W1 ~ danmakusCount + timeDuration + watchCount +
## interactionCount + superchatCount + membershipCount + followers,
## data = df_causal, method = "full", estimand = "ATE")
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.4422 0.2902 0.8810 1.4141
## danmakusCount 0.2969 -0.1545 0.3814 1.0152
## timeDuration -0.0784 0.0408 -0.2258 0.9465
## watchCount 0.3887 -0.2022 0.5180 0.8011
## interactionCount 0.3669 -0.1909 0.5308 0.7993
## superchatCount 0.4188 -0.2179 0.3990 1.0045
## membershipCount 0.2352 -0.1223 0.2602 0.9848
## followers 0.1631 -0.0849 0.2754 0.8238
## eCDF Mean eCDF Max
## distance 0.2293 0.3282
## danmakusCount 0.1170 0.1898
## timeDuration 0.0678 0.1306
## watchCount 0.1551 0.2387
## interactionCount 0.1578 0.2653
## superchatCount 0.0653 0.2010
## membershipCount 0.0196 0.1243
## followers 0.1293 0.3224
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.3422 0.3414 0.0048 1.0281
## danmakusCount 0.0449 -0.0072 0.0441 0.9499
## timeDuration 0.0078 -0.0290 0.0697 1.0163
## watchCount 0.0874 -0.0076 0.0832 0.7663
## interactionCount 0.0545 0.0009 0.0510 0.7938
## superchatCount 0.0049 -0.0243 0.0183 1.0547
## membershipCount 0.0248 -0.0131 0.0276 0.8481
## followers 0.1042 0.0415 0.0696 0.9074
## eCDF Mean eCDF Max Std. Pair Dist.
## distance 0.0012 0.0131 0.0116
## danmakusCount 0.0307 0.0613 1.0118
## timeDuration 0.0278 0.0682 1.1303
## watchCount 0.0539 0.1164 0.9251
## interactionCount 0.0500 0.1007 0.9408
## superchatCount 0.0133 0.0730 1.0292
## membershipCount 0.0143 0.0363 1.0896
## followers 0.0784 0.2197 1.2585
##
## Sample Sizes:
## Control Treated
## All 1605. 835.
## Matched (ESS) 1008.56 379.95
## Matched 1605. 835.
## Unmatched 0. 0.
## Discarded 0. 0.
# Love plot of the balance summary.
plot(summary(m.out))
# Extract the matched data set; it carries distance (propensity score),
# weights, and subclass columns in addition to the original variables.
m.data <- match.data(m.out)
head(m.data)
## danmakusCount timeDuration totalIncome watchCount interactionCount
## 1 -0.7696378 0.09821249 0.1919227 -1.0381997 -0.9292886
## 2 -0.4105296 0.16726665 2.0330875 -0.8488732 -0.6023242
## 3 -0.9621366 -0.16904624 0.9786163 -1.2253391 -0.9662896
## 4 -0.3394832 0.39824559 1.7372021 -0.9497793 -0.7233869
## 5 -1.4040206 1.54300502 -0.7764418 -0.7478239 -1.2056391
## 6 -1.3862329 -0.28382835 0.1947693 -1.4512956 -1.3188552
## superchatCount membershipCount followers W1 W2 distance weights subclass
## 1 0.6975887 -1.84848200 -1.93097 1 0 0.4085820 0.6844262 1
## 2 0.4099066 1.24256045 -1.93097 1 0 0.4231426 0.6844262 99
## 3 -0.1290899 1.04188975 -1.93097 1 0 0.4381860 0.6844262 204
## 4 0.8694390 0.63642465 -1.93097 1 1 0.3026124 0.6844262 300
## 5 -0.9763877 -0.05672253 -1.93097 1 1 0.1363506 4.7909836 393
## 6 0.2275851 -0.46218764 -1.93097 1 0 0.4233618 0.6844262 478
library("marginaleffects")
# Causal forest for W1 fitted on the matched sample, using the matching
# weights as sample weights. Columns 1:2 and 4:8 of m.data are the seven
# covariates (column 3 is totalIncome).
set.seed(114514)
X=m.data[,c(1:2,4:8)]
Y=m.data[,"totalIncome"]
W=m.data[,"W1"]
tau.forest <- causal_forest(X, Y, W,
tune.parameters = "all",sample.weights = m.data$weights
)
tau.hat <- predict(tau.forest)$predictions
head(predict(tau.forest,estimate.variance = TRUE))
## predictions variance.estimates debiased.error excess.error
## 1 2.732758 2.499211 0.71491608 0.0005750609
## 2 2.460348 3.777838 0.20360405 0.0003453791
## 3 2.908501 1.588693 0.60327193 0.0018188093
## 4 2.965362 1.774319 0.04601626 0.0004729487
## 5 2.942272 1.088220 0.58561488 0.0007510384
## 6 2.968560 1.680967 0.65850546 0.0019462661
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.24521 -0.08183 0.25090 0.44143 0.91377 4.28008
# ATE under four target samples; the overlap warnings below show the
# limited-overlap problem for W1 persists even after matching.
average_treatment_effect(tau.forest, target.sample = "all")
## Warning in average_treatment_effect(tau.forest, target.sample = "all"):
## Estimated treatment propensities take values between 0.01 and 0.997
## and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
## estimate std.err
## 0.48884385 0.03863781
average_treatment_effect(tau.forest, target.sample = "treated")
## Warning in average_treatment_effect(tau.forest, target.sample =
## "treated"): Estimated treatment propensities take values between 0.01 and
## 0.997 and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
## estimate std.err
## 0.44586335 0.09154942
average_treatment_effect(tau.forest, target.sample = "control")
## Warning in average_treatment_effect(tau.forest, target.sample =
## "control"): Estimated treatment propensities take values between 0.01 and
## 0.997 and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
## estimate std.err
## 0.54881240 0.07219962
average_treatment_effect(tau.forest, target.sample = "overlap")
## estimate std.err
## 0.3094175 0.1578538
library("marginaleffects")
# Post-matching W1 effect via g-computation: weighted outcome model with
# full W1-by-covariate interactions; SEs clustered on matching subclass.
fit <- lm(totalIncome~(.-W2-distance-weights-subclass)+W1*(.-W2-distance-weights-subclass),data = m.data, weights = weights)
avg_comparisons(fit,
variables = "W1",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W1 1 - 0 0.217 0.0483 4.49 <0.001 0.122 0.311
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")
# Same, but with a main-effects-only outcome model (no interactions).
fit <- lm(totalIncome~(.-W2-distance-weights-subclass),data = m.data, weights = weights)
avg_comparisons(fit,
variables = "W1",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W1 1 - 0 0.213 0.0545 3.91 <0.001 0.106 0.32
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")
# Same, with no covariate adjustment at all (weights only). All three
# specifications agree: roughly 0.21-0.26 and significant.
fit <- lm(totalIncome~W1,data = m.data, weights = weights)
avg_comparisons(fit,
variables = "W1",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W1 1 - 0 0.259 0.0954 2.72 0.00655 0.0724 0.446
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
# Full matching on the propensity score for the second treatment W2,
# again targeting the ATE.
library(MatchIt)
m.out2 <- matchit(W2 ~ danmakusCount+timeDuration+watchCount+interactionCount+superchatCount+membershipCount+followers, data = df_causal,
method = "full", estimand = "ATE")
# Balance was already decent pre-matching (max SMD 0.32 on distance) and
# is near-perfect after matching.
summary(m.out2)
##
## Call:
## matchit(formula = W2 ~ danmakusCount + timeDuration + watchCount +
## interactionCount + superchatCount + membershipCount + followers,
## data = df_causal, method = "full", estimand = "ATE")
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.3456 0.3228 0.3228 1.3382
## danmakusCount 0.0595 -0.0294 0.0742 0.9682
## timeDuration 0.0484 -0.0239 0.1351 1.0882
## watchCount 0.0298 -0.0147 0.0373 1.0158
## interactionCount 0.0323 -0.0159 0.0440 0.9614
## superchatCount 0.2064 -0.1018 0.1903 1.0208
## membershipCount 0.1883 -0.0929 0.2025 1.0784
## followers -0.0108 0.0053 -0.0173 1.0740
## eCDF Mean eCDF Max
## distance 0.0881 0.1512
## danmakusCount 0.0211 0.0444
## timeDuration 0.0427 0.0801
## watchCount 0.0129 0.0374
## interactionCount 0.0099 0.0363
## superchatCount 0.0296 0.0860
## membershipCount 0.0174 0.1022
## followers 0.0147 0.0340
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.3303 0.3303 0.0006 1.0008
## danmakusCount 0.0369 0.0055 0.0263 0.9822
## timeDuration 0.0254 -0.0005 0.0483 1.0391
## watchCount 0.0561 0.0057 0.0422 1.1170
## interactionCount 0.0485 0.0061 0.0386 0.9634
## superchatCount 0.0382 -0.0038 0.0260 1.0051
## membershipCount -0.0123 0.0221 -0.0248 0.8547
## followers 0.0553 0.0066 0.0526 1.0594
## eCDF Mean eCDF Max Std. Pair Dist.
## distance 0.0008 0.0053 0.0048
## danmakusCount 0.0106 0.0364 1.1241
## timeDuration 0.0223 0.0535 0.9996
## watchCount 0.0165 0.0454 1.1424
## interactionCount 0.0100 0.0325 1.1534
## superchatCount 0.0054 0.0217 0.8874
## membershipCount 0.0131 0.0365 0.8121
## followers 0.0210 0.0509 1.1745
##
## Sample Sizes:
## Control Treated
## All 1634. 806.
## Matched (ESS) 1409.12 564.08
## Matched 1634. 806.
## Unmatched 0. 0.
## Discarded 0. 0.
# Love plot of the balance summary.
plot(summary(m.out2))
m.data2 <- match.data(m.out2)
head(m.data2)
## danmakusCount timeDuration totalIncome watchCount interactionCount
## 1 -0.7696378 0.09821249 0.1919227 -1.0381997 -0.9292886
## 2 -0.4105296 0.16726665 2.0330875 -0.8488732 -0.6023242
## 3 -0.9621366 -0.16904624 0.9786163 -1.2253391 -0.9662896
## 4 -0.3394832 0.39824559 1.7372021 -0.9497793 -0.7233869
## 5 -1.4040206 1.54300502 -0.7764418 -0.7478239 -1.2056391
## 6 -1.3862329 -0.28382835 0.1947693 -1.4512956 -1.3188552
## superchatCount membershipCount followers W1 W2 distance weights subclass
## 1 0.6975887 -1.84848200 -1.93097 1 0 0.3683040 1.3393443 632
## 2 0.4099066 1.24256045 -1.93097 1 0 0.4205506 0.8036066 580
## 3 -0.1290899 1.04188975 -1.93097 1 0 0.4030760 0.8928962 84
## 4 0.8694390 0.63642465 -1.93097 1 1 0.4472629 0.4129098 1
## 5 -0.9763877 -0.05672253 -1.93097 1 1 0.4723792 0.4404372 108
## 6 0.2275851 -0.46218764 -1.93097 1 0 0.4043168 1.3393443 241
library("marginaleffects")
# Post-matching W2 effect via g-computation with full W2-by-covariate
# interactions; SEs clustered on matching subclass.
fit <- lm(totalIncome~(.-W1-distance-weights-subclass)+W2*(.-W1-distance-weights-subclass),data = m.data2, weights = weights)
avg_comparisons(fit,
variables = "W2",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W2 1 - 0 0.0213 0.0402 0.53 0.596 -0.0575 0.1
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")
# Same, main-effects-only outcome model.
fit <- lm(totalIncome~(.-W1-distance-weights-subclass),data = m.data2, weights = weights)
avg_comparisons(fit,
variables = "W2",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W2 1 - 0 0.0206 0.0404 0.51 0.61 -0.0586 0.0999
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")
# Same, no covariate adjustment. All three specifications agree: the W2
# effect is about 0.02 and not significant after matching.
fit <- lm(totalIncome~W2,data = m.data2, weights = weights)
avg_comparisons(fit,
variables = "W2",
vcov = ~subclass,
wts = "weights")
##
## Term Contrast Estimate Std. Error z Pr(>|z|) 2.5 % 97.5 %
## W2 1 - 0 0.0216 0.0694 0.311 0.755 -0.114 0.158
##
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")
# Causal forest for W2 on the matched sample, using matching weights as
# sample weights. Columns 1:2 and 4:8 of m.data2 are the covariates.
set.seed(114514)
X=m.data2[,c(1:2,4:8)]
Y=m.data2[,"totalIncome"]
W=m.data2[,"W2"]
tau.forest <- causal_forest(X, Y, W,
tune.parameters = "all",sample.weights = m.data2$weights
)
tau.hat <- predict(tau.forest)$predictions
head(predict(tau.forest,estimate.variance = TRUE))
## predictions variance.estimates debiased.error excess.error
## 1 0.01286693 0.007477905 0.192365005 8.210039e-05
## 2 0.07368682 0.005873541 0.481168133 7.265610e-05
## 3 0.01732016 0.001862600 0.001972586 5.948801e-05
## 4 0.05092729 0.010353448 0.327539426 1.260941e-04
## 5 -0.02185859 0.035880638 0.054570047 1.607373e-04
## 6 0.09250286 0.008440955 0.150049071 6.706317e-05
summary(tau.hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.23021 -0.03187 0.02716 0.01667 0.07294 0.22168
# ATEs under all target samples agree (~0.014-0.021) and none is
# significant; no overlap warnings for W2.
average_treatment_effect(tau.forest, target.sample = "all")
## estimate std.err
## 0.01634060 0.03535471
average_treatment_effect(tau.forest, target.sample = "treated")
## estimate std.err
## 0.02101583 0.03467628
average_treatment_effect(tau.forest, target.sample = "control")
## estimate std.err
## 0.01403770 0.03631965
average_treatment_effect(tau.forest, target.sample = "overlap")
## estimate std.err
## 0.01789833 0.03487897
# Reference specification: a single OLS with both treatments W1 and W2
# plus all covariates. W1 remains significant (0.147, p < 0.001) while
# W2 does not (0.051, p = 0.144) — consistent with the forest and
# matching results above.
summary( lm(totalIncome~(.),data=df_causal) )
##
## Call:
## lm(formula = totalIncome ~ (.), data = df_causal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5903 -0.3885 -0.0908 0.3485 3.8241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0671977 0.0236867 -2.837 0.00459 **
## danmakusCount 0.1339164 0.0519730 2.577 0.01003 *
## timeDuration 0.1167450 0.0383133 3.047 0.00234 **
## watchCount 0.0125995 0.0397595 0.317 0.75135
## interactionCount -0.0639256 0.0755090 -0.847 0.39730
## superchatCount 0.4202298 0.0189408 22.186 < 2e-16 ***
## membershipCount 0.6225290 0.0157685 39.479 < 2e-16 ***
## followers -0.0009806 0.0313762 -0.031 0.97507
## W1 0.1467601 0.0371184 3.954 7.91e-05 ***
## W2 0.0513868 0.0351373 1.462 0.14375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8073 on 2430 degrees of freedom
## Multiple R-squared: 0.7809, Adjusted R-squared: 0.7801
## F-statistic: 962.5 on 9 and 2430 DF, p-value: < 2.2e-16
# Diagnostic plots for the joint model.
plot( lm(totalIncome~(.),data=df_causal) )